Load Packages

#install.packages("tidyverse")
library(tidyverse)
#install.packages("moderndive")
library(moderndive)
#install.packages("skimr")
library(skimr)
#install.packages("ggplot2")
library(ggplot2)
#install.packages("dplyr")
library(dplyr)
#install.packages("readr")
library(readr)
#install.packages("tidyr")
library(knitr)
#install.packages("infer")
library(infer)

About the Dataset The National Health Interview Survey (NHIS) has monitored the health of the nation since 1957. This is one of the data sets provided by the National Cardiovascular Disease (CVD) Surveillance System. This data set aims to provide a comprehensive picture of the public health burden of CVDs and associated risk factors in the United States. The data are organized by location (region) and indicator, and they include CVDs (e.g., heart failure) and risk factors (e.g., hypertension).

Reference: Data Provided by Centers for Disease Control and Prevention, National Center for Chronic Disease Prevention and Health Promotion, Division of Health Disease and Stroke Prevention (DHDSP), National Cardiovascular Disease Surveillance System. https://chronicdata.cdc.gov/Heart-Disease-Stroke-Prevention/Behavioral-Risk-Factor-Surveillance-System-BRFSS-N/ikwk-8git Accessed on: 1st December 2021

Import Data set:

CVD_data <- read.csv("C:/Shivangi/Michigan/Courses/Fall 2021/SI 544/Final Project/Behavioral_Risk_Factor_Surveillance_System__BRFSS__-__National_Cardiovascular_Disease_Surveillance_Data.csv")

About Data Elements and Data Description:
Year: year
LocationAbbr: Location abbreviation
LocationDesc: Location description
DataSource: Data source
PriorityArea1: Priority Area (Million Hearts or None)
PriorityArea2: Priority Area (ABCS or None)
PriorityArea3: Priority Area (Healthy People 2020 or None)
PriorityArea4: Priority Area(AHA 2020 Goals: Cardiovascular Health Metrics or None)
Category: Category description
Topic: Topic description
Indicator: Indicator description
Data_Value_Type: Data Value Type (mean,rate,percentage)
Data_Value_Unit: Data Value Unit (%, rate per 100,000, etc.)
Data_Value: Data Value (point estimate)
Data_Value_Alt: Equal to data value, but formatting is numeric
Data_Value_Footnote_Symbol:Symbol that would be used to flag footnotes
Data_Value_Footnote: Footnote description
LowConfidenceLimit: 95% confidence interval lower bound
HighConfidenceLimit: 95% confidence interval upper bound
Break_Out_Category: Break out category description
Break_Out: Break out group description
CategoryId: Category lookup value
TopicId: Topic lookup value
IndicatorID: Indicator lookup value Data_Value_TypeID: Data value type lookup value
BreakOutCategoryId: Break out category lookup value
BreakOutId: Break out group lookup value
LocationID: Location lookup value
Geolocation:Geolocation

View the Data Set:

#Let us first view the data set to get a better idea of the columns and data types of the columns. We can use the head function to inspect the first few rows of the data set.
View(head(CVD_data))
#We observe that there are 30 columns in the Cardiovascular Diseases (CVD) data set.

Clean Data Set/Removing Rows with Missing Values:

#We can see that multiple rows contain "na" values, so let us clean our data set or omit the not applicable values.
CVD_cleaned <- na.omit(CVD_data)
View(CVD_cleaned)

Count number of Observations in Data Set:

#Let us count the number of rows in the original data set and the cleaned data set.
CVD_count <- nrow(CVD_data)
print(CVD_count)
## [1] 126256
CVD_cleaned_count <- nrow(CVD_cleaned)
print(CVD_cleaned_count)
## [1] 79481
#Now, we see that the number of rows after removing "na" values has reduced to approximately 80,000. 

Merging a secondary data set:

#Let us now merge another data set named as Category Score data frame. This data set contains category scroe in binary format where 1 denotes "cardiovascular diseases" and 0 denotes "risk factors".
categoryscore_df <- read.csv("CategoryScore_df.csv")
CVD_merged <- merge(categoryscore_df, CVD_cleaned, by = "Category")
CVD_merged

Selection of columns:

#Now that our data frame is cleaned and merged, let us pick a few columns to work with. We will choose Year, LocationAbbr, LocationDesc, Category, Topic and Indicator to work with. Additionally I will be changing the names of the columns LocationAbbr to Location_Abbreviation and LocationDesc to Location_Description for better understanding.

CVD_df <- CVD_merged %>%
select(Year, LocationAbbr, LocationDesc, Category, Topic, Indicator, Break_Out_Category,Break_Out, Data_Value) %>%
rename(Location_Abbreviation = LocationAbbr, Location_Description  = LocationDesc)
glimpse(CVD_df)
## Rows: 79,481
## Columns: 9
## $ Year                  <int> 2018, 2015, 2015, 2011, 2013, 2015, 2012, 2015, ~
## $ Location_Abbreviation <chr> "ME", "IL", "AR", "OR", "OR", "TN", "NH", "FL", ~
## $ Location_Description  <chr> "Maine", "Illinois", "Arkansas", "Oregon", "Oreg~
## $ Category              <chr> "Cardiovascular Diseases", "Cardiovascular Disea~
## $ Topic                 <chr> "Major Cardiovascular Disease", "Acute Myocardia~
## $ Indicator             <chr> "Prevalence of major cardiovascular disease amon~
## $ Break_Out_Category    <chr> "Age", "Race", "Age", "Age", "Gender", "Race", "~
## $ Break_Out             <chr> "45-64", "Non-Hispanic White", "75+", "35+", "Fe~
## $ Data_Value            <dbl> 10.2, 4.1, 30.8, 9.9, 2.5, 11.5, 5.7, 10.5, 17.3~

Part I: Analyzing Risk Factors associated with CVD:

#Wrangle data using filter() function
riskfactors_category <- CVD_df %>%
  filter(Category == 'Risk Factors')

#Sorting and grouping the risk factors according to state
risk_freq <- riskfactors_category %>%
group_by(Location_Abbreviation) %>%
summarize(num_riskfactors = n())

risk_freq
#Found out that there are 1570 entries for each state. Picking AZ data set containing the risk factors.

riskfactors_AZ <- riskfactors_category %>%
  filter(Location_Abbreviation == "AZ")
riskfactors_AZ

Group by topic and indicator columns and Summarize the data set using skim()

#The "topic" column tells us the type of risk factor for a particular year and the "indicator" column tells us the indicator of the type of risk factor. 
#Let us now now group according to the topic and indicator
by_topic_indicator <- riskfactors_AZ %>%
group_by(Topic, Indicator) %>%
summarize(count = n())

View(by_topic_indicator)

#Before we visualize the data set using a histogram, let us summarize the data set
by_topic_indicator %>% select(Topic, count) %>% skim()
Data summary
Name Piped data
Number of rows 10
Number of columns 2
_______________________
Column type frequency:
numeric 1
________________________
Group variables Topic

Variable type: numeric

skim_variable Topic n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
count Cholesterol Abnormalities 0 1 50 15.56 39 44.5 50 55.5 61 ▇▁▁▁▇
count Diabetes 0 1 155 NA 155 155.0 155 155.0 155 ▁▁▇▁▁
count Hypertension 0 1 80 7.07 75 77.5 80 82.5 85 ▇▁▁▁▇
count Nutrition 0 1 80 NA 80 80.0 80 80.0 80 ▁▁▇▁▁
count Obesity 0 1 153 7.07 148 150.5 153 155.5 158 ▇▁▁▁▇
count Physical Inactivity 0 1 155 NA 155 155.0 155 155.0 155 ▁▁▇▁▁
count Smoking 0 1 147 NA 147 147.0 147 147.0 147 ▁▁▇▁▁
#We observe that Diabetes has the highest mean value of 155 which leads us to drawing an analysis that diabetes potentially is a risk factor to cardiovascular diseases in Arizona.

Visualize using a Histogram

#Let us now build a histogram where we can visualize the count of the types of risk factors in Arizona year-wise.
ggplot(riskfactors_AZ, aes(x = Year)) +
geom_histogram(binwidth = 1, color = "white") +
labs(x = "Count",
y = "Type of Risk Factor",
title = "Histogram of distribution of various risk factors associated with cardiovascular diseases") +
facet_wrap(~ Topic, ncol = 3)

#We notice that diabetes has been a constant risk factor over the years in terms of count whereas obesity as a risk factor declined in 2016 and 2017 but rose back again in 2018 to a higher count.

Part II: Analyzing Types of Cardiovascular Diseases:

#Wrangle data using filter() function
cvd_category <- CVD_df %>%
  filter(Category == 'Cardiovascular Diseases')

#Sorting and grouping the risk factors according to break_out column which contains age, gender, race and overall factors
cvd_freq <- cvd_category %>%
group_by(Break_Out) %>%
summarize(num_cvd = n())

cvd_freq

Visualization through a Barplot

ggplot(cvd_freq, aes(x = Break_Out, y = num_cvd)) +
geom_col(position = "dodge")+
labs(x = "Age, Gender, Race, Overall", y = "CVD Count", title = "Barplot of Relationship of age, gender, race and overall with CVD count")+
theme(axis.text.x = element_text(angle = 90, size = 10))

#We can see that on an average, CVD is a disease prevalent in the Non-Hispanic White population and it is non-prevalent or less prevalent in the Non-Hispanic Asian population.  

#It is also evident that CVD does not have much relation to the gender and is equally likely to occur in both the genders.

Summary of few columns

#Let us take a summary of the 75+ age category.
cvd_summary <- CVD_df %>%
  filter(Break_Out == "75+")
View(cvd_summary)
summary(cvd_summary $ Data_Value)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   13.70   22.80   33.09   52.50   99.40
#Thus, it can be seen that 33% of the people of 75+ age are likely to develop cardiovascular diseases.  

Hypothesis Testing:

heart_attack <- CVD_cleaned %>% filter(Indicator == "Prevalence of stroke among US adults (18+); BRFSS", Break_Out == "Non-Hispanic Black" | Break_Out == "Non-Hispanic White")
View(heart_attack)
unique(CVD_cleaned$Indicator)
##  [1] "Prevalence of diabetes among US adults (18+); BRFSS"                                                  
##  [2] "Prevalence of current smoking among US adults (18+); BRFSS"                                           
##  [3] "Prevalence of major cardiovascular disease among US adults (18+); BRFSS"                              
##  [4] "Prevalence of acute myocardial infarction (heart attack) among US adults (18+); BRFSS"                
##  [5] "Prevalence of coronary heart disease among US adults (18+); BRFSS"                                    
##  [6] "Prevalence of healthy weight among US adults (20+); BRFSS"                                            
##  [7] "Prevalence of obesity among US adults (20+); BRFSS"                                                   
##  [8] "Prevalence of stroke among US adults (18+); BRFSS"                                                    
##  [9] "Prevalence of cholesterol screening in the past 5 years among US adults (20+); BRFSS"                 
## [10] "Prevalence of hypertension among US adults (18+); BRFSS"                                              
## [11] "Prevalence of physical inactivity among US adults (18+); BRFSS"                                       
## [12] "Prevalence of high total cholesterol among US adults (20+); BRFSS"                                    
## [13] "Prevalence of consuming fruits and vegetables less than 5 times per day among US adults (18+); BRFSS" 
## [14] "Prevalence of post-hospitalization rehabilitation among heart attack patients, US adults (18+); BRFSS"
## [15] "Prevalence of hypertension medication use among US adults (18+) with hypertension; BRFSS"
heart_attack <- heart_attack %>% filter(Break_Out_Category == "Race")

null_distribution_heart <- heart_attack %>%
specify(formula = Data_Value ~ Break_Out) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means", order = c("Non-Hispanic Black", "Non-Hispanic White"))
null_distribution_heart
heart_diff_means <- heart_attack %>%
specify(formula = Data_Value ~ Break_Out) %>%
calculate(stat = "diff in means", order = c("Non-Hispanic Black", "Non-Hispanic White"))
heart_diff_means
visualize(null_distribution_heart, bins = 10) +
shade_p_value(obs_stat = heart_diff_means, direction = "both")

null_distribution_heart %>%
get_p_value(obs_stat = heart_diff_means, direction = "both")
#The obtained p-value is less than threshold. The null hypothesis is that cardiovascular disease is not related to race, hence it is rejected.